1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package com.sun.java.util.jar.pack;
27
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.io.PrintStream;
31 import java.util.Arrays;
32
33
34
35
36
37 final class Histogram {
38
39
40 protected final int[][] matrix;
41 protected final int totalWeight;
42
43
44
45 protected final int[] values;
46 protected final int[] counts;
47
48 private static final long LOW32 = (long)-1 >>> 32;
49
50
51
52
53 public
54 Histogram(int[] valueSequence) {
55 long[] hist2col = computeHistogram2Col(maybeSort(valueSequence));
56 int[][] table = makeTable(hist2col);
57 values = table[0];
58 counts = table[1];
59 this.matrix = makeMatrix(hist2col);
60 this.totalWeight = valueSequence.length;
61 assert(assertWellFormed(valueSequence));
62 }
63 public
64 Histogram(int[] valueSequence, int start, int end) {
65 this(sortedSlice(valueSequence, start, end));
66 }
67
68
69 public
70 Histogram(int[][] matrix) {
71
72 matrix = normalizeMatrix(matrix);
73 this.matrix = matrix;
74 int length = 0;
75 int weight = 0;
76 for (int i = 0; i < matrix.length; i++) {
77 int rowLength = matrix[i].length-1;
78 length += rowLength;
79 weight += matrix[i][0] * rowLength;
80 }
81 this.totalWeight = weight;
82 long[] hist2col = new long[length];
83 int fillp = 0;
84 for (int i = 0; i < matrix.length; i++) {
85 for (int j = 1; j < matrix[i].length; j++) {
86
87 hist2col[fillp++] = ((long) matrix[i][j] << 32)
88 | (LOW32 & matrix[i][0]);
89 }
90 }
91 assert(fillp == hist2col.length);
92 Arrays.sort(hist2col);
93 int[][] table = makeTable(hist2col);
94 values = table[1];
95 counts = table[0];
96 assert(assertWellFormed(null));
97 }
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118 public
119 int[][] getMatrix() { return matrix; }
120
121 public
122 int getRowCount() { return matrix.length; }
123
124 public
125 int getRowFrequency(int rn) { return matrix[rn][0]; }
126
127 public
128 int getRowLength(int rn) { return matrix[rn].length-1; }
129
130 public
131 int getRowValue(int rn, int vn) { return matrix[rn][vn+1]; }
132
133 public
134 int getRowWeight(int rn) {
135 return getRowFrequency(rn) * getRowLength(rn);
136 }
137
138 public
139 int getTotalWeight() {
140 return totalWeight;
141 }
142
143 public
144 int getTotalLength() {
145 return values.length;
146 }
147
148
149 public
150 int[] getAllValues() {
151
152 return values;
153 }
154
155
156
157
158 public
159 int[] getAllFrequencies() {
160 return counts;
161 }
162
163 private static double log2 = Math.log(2);
164
165 public
166 int getFrequency(int value) {
167 int pos = Arrays.binarySearch(values, value);
168 if (pos < 0) return 0;
169 assert(values[pos] == value);
170 return counts[pos];
171 }
172
173 public
174 double getBitLength(int value) {
175 double prob = (double) getFrequency(value) / getTotalWeight();
176 return - Math.log(prob) / log2;
177 }
178
179 public
180 double getRowBitLength(int rn) {
181 double prob = (double) getRowFrequency(rn) / getTotalWeight();
182 return - Math.log(prob) / log2;
183 }
184
185 public
186 interface BitMetric {
187 public double getBitLength(int value);
188 }
189 private final BitMetric bitMetric = new BitMetric() {
190 public double getBitLength(int value) {
191 return Histogram.this.getBitLength(value);
192 }
193 };
194 public BitMetric getBitMetric() {
195 return bitMetric;
196 }
197
198
199 public
200 double getBitLength() {
201 double sum = 0;
202 for (int i = 0; i < matrix.length; i++) {
203 sum += getRowBitLength(i) * getRowWeight(i);
204 }
205 assert(0.1 > Math.abs(sum - getBitLength(bitMetric)));
206 return sum;
207 }
208
209
210 public
211 double getBitLength(BitMetric len) {
212 double sum = 0;
213 for (int i = 0; i < matrix.length; i++) {
214 for (int j = 1; j < matrix[i].length; j++) {
215 sum += matrix[i][0] * len.getBitLength(matrix[i][j]);
216 }
217 }
218 return sum;
219 }
220
221 static private
222 double round(double x, double scale) {
223 return Math.round(x * scale) / scale;
224 }
225
226
227
228
229
230 public int[][] normalizeMatrix(int[][] matrix) {
231 long[] rowMap = new long[matrix.length];
232 for (int i = 0; i < matrix.length; i++) {
233 if (matrix[i].length <= 1) continue;
234 int count = matrix[i][0];
235 if (count <= 0) continue;
236 rowMap[i] = (long) count << 32 | i;
237 }
238 Arrays.sort(rowMap);
239 int[][] newMatrix = new int[matrix.length][];
240 int prevCount = -1;
241 int fillp1 = 0;
242 int fillp2 = 0;
243 for (int i = 0; ; i++) {
244 int[] row;
245 if (i < matrix.length) {
246 long rowMapEntry = rowMap[rowMap.length-i-1];
247 if (rowMapEntry == 0) continue;
248 row = matrix[(int)rowMapEntry];
249 assert(rowMapEntry>>>32 == row[0]);
250 } else {
251 row = new int[]{ -1 };
252 }
253 if (row[0] != prevCount && fillp2 > fillp1) {
254
255 int length = 0;
256 for (int p = fillp1; p < fillp2; p++) {
257 int[] row0 = newMatrix[p];
258 assert(row0[0] == prevCount);
259 length += row0.length-1;
260 }
261 int[] row1 = new int[1+length];
262 row1[0] = prevCount;
263 int rfillp = 1;
264 for (int p = fillp1; p < fillp2; p++) {
265 int[] row0 = newMatrix[p];
266 assert(row0[0] == prevCount);
267 System.arraycopy(row0, 1, row1, rfillp, row0.length-1);
268 rfillp += row0.length-1;
269 }
270 if (!isSorted(row1, 1, true)) {
271 Arrays.sort(row1, 1, row1.length);
272 int jfillp = 2;
273
274 for (int j = 2; j < row1.length; j++) {
275 if (row1[j] != row1[j-1])
276 row1[jfillp++] = row1[j];
277 }
278 if (jfillp < row1.length) {
279
280 int[] newRow1 = new int[jfillp];
281 System.arraycopy(row1, 0, newRow1, 0, jfillp);
282 row1 = newRow1;
283 }
284 }
285 newMatrix[fillp1++] = row1;
286 fillp2 = fillp1;
287 }
288 if (i == matrix.length)
289 break;
290 prevCount = row[0];
291 newMatrix[fillp2++] = row;
292 }
293 assert(fillp1 == fillp2);
294
295 matrix = newMatrix;
296 if (fillp1 < matrix.length) {
297 newMatrix = new int[fillp1][];
298 System.arraycopy(matrix, 0, newMatrix, 0, fillp1);
299 matrix = newMatrix;
300 }
301 return matrix;
302 }
303
304 public
305 String[] getRowTitles(String name) {
306 int totalUnique = getTotalLength();
307 int ltotalWeight = getTotalWeight();
308 String[] histTitles = new String[matrix.length];
309 int cumWeight = 0;
310 int cumUnique = 0;
311 for (int i = 0; i < matrix.length; i++) {
312 int count = getRowFrequency(i);
313 int unique = getRowLength(i);
314 int weight = getRowWeight(i);
315 cumWeight += weight;
316 cumUnique += unique;
317 long wpct = ((long)cumWeight * 100 + ltotalWeight/2) / ltotalWeight;
318 long upct = ((long)cumUnique * 100 + totalUnique/2) / totalUnique;
319 double len = getRowBitLength(i);
320 assert(0.1 > Math.abs(len - getBitLength(matrix[i][1])));
321 histTitles[i] = name+"["+i+"]"
322 +" len="+round(len,10)
323 +" ("+count+"*["+unique+"])"
324 +" ("+cumWeight+":"+wpct+"%)"
325 +" ["+cumUnique+":"+upct+"%]";
326 }
327 return histTitles;
328 }
329
330
331
332 public
333 void print(PrintStream out) {
334 print("hist", out);
335 }
336
337
338
339 public
340 void print(String name, PrintStream out) {
341 print(name, getRowTitles(name), out);
342 }
343
344
345
346 public
347 void print(String name, String[] histTitles, PrintStream out) {
348 int totalUnique = getTotalLength();
349 int ltotalWeight = getTotalWeight();
350 double tlen = getBitLength();
351 double avgLen = tlen / ltotalWeight;
352 double avg = (double) ltotalWeight / totalUnique;
353 String title = (name
354 +" len="+round(tlen,10)
355 +" avgLen="+round(avgLen,10)
356 +" weight("+ltotalWeight+")"
357 +" unique["+totalUnique+"]"
358 +" avgWeight("+round(avg,100)+")");
359 if (histTitles == null) {
360 out.println(title);
361 } else {
362 out.println(title+" {");
363 StringBuffer buf = new StringBuffer();
364 for (int i = 0; i < matrix.length; i++) {
365 buf.setLength(0);
366 buf.append(" ").append(histTitles[i]).append(" {");
367 for (int j = 1; j < matrix[i].length; j++) {
368 buf.append(" ").append(matrix[i][j]);
369 }
370 buf.append(" }");
371 out.println(buf);
372 }
373 out.println("}");
374 }
375 }
376
377
378
379
380
381
382
383
384
385
386
387
388 private static
389 int[][] makeMatrix(long[] hist2col) {
390
391 Arrays.sort(hist2col);
392 int[] counts = new int[hist2col.length];
393 for (int i = 0; i < counts.length; i++) {
394 counts[i] = (int)( hist2col[i] >>> 32 );
395 }
396 long[] countHist = computeHistogram2Col(counts);
397 int[][] matrix = new int[countHist.length][];
398 int histp = 0;
399 int countp = 0;
400
401 for (int i = matrix.length; --i >= 0; ) {
402 long countAndRep = countHist[countp++];
403 int count = (int) (countAndRep);
404 int repeat = (int) (countAndRep >>> 32);
405 int[] row = new int[1+repeat];
406 row[0] = count;
407 for (int j = 0; j < repeat; j++) {
408 long countAndValue = hist2col[histp++];
409 assert(countAndValue >>> 32 == count);
410 row[1+j] = (int) countAndValue;
411 }
412 matrix[i] = row;
413 }
414 assert(histp == hist2col.length);
415 return matrix;
416 }
417
418 private static
419 int[][] makeTable(long[] hist2col) {
420 int[][] table = new int[2][hist2col.length];
421
422
423 for (int i = 0; i < hist2col.length; i++) {
424 table[0][i] = (int)( hist2col[i] );
425 table[1][i] = (int)( hist2col[i] >>> 32 );
426 }
427 return table;
428 }
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445 private static
446 long[] computeHistogram2Col(int[] sortedValues) {
447 switch (sortedValues.length) {
448 case 0:
449 return new long[]{ };
450 case 1:
451 return new long[]{ ((long)1 << 32) | (LOW32 & sortedValues[0]) };
452 }
453 long[] hist = null;
454 for (boolean sizeOnly = true; ; sizeOnly = false) {
455 int prevIndex = -1;
456 int prevValue = sortedValues[0] ^ -1;
457 int prevCount = 0;
458 for (int i = 0; i <= sortedValues.length; i++) {
459 int thisValue;
460 if (i < sortedValues.length)
461 thisValue = sortedValues[i];
462 else
463 thisValue = prevValue ^ -1;
464 if (thisValue == prevValue) {
465 prevCount += 1;
466 } else {
467
468 if (!sizeOnly && prevCount != 0) {
469
470 hist[prevIndex] = ((long)prevCount << 32)
471 | (LOW32 & prevValue);
472 }
473 prevValue = thisValue;
474 prevCount = 1;
475 prevIndex += 1;
476 }
477 }
478 if (sizeOnly) {
479
480 hist = new long[prevIndex];
481 } else {
482 break;
483 }
484 }
485 return hist;
486 }
487
488
489
490
491
492
493
494
495 private static
496 int[][] regroupHistogram(int[][] matrix, int[] groups) {
497 long oldEntries = 0;
498 for (int i = 0; i < matrix.length; i++) {
499 oldEntries += matrix[i].length-1;
500 }
501 long newEntries = 0;
502 for (int ni = 0; ni < groups.length; ni++) {
503 newEntries += groups[ni];
504 }
505 if (newEntries > oldEntries) {
506 int newlen = groups.length;
507 long ok = oldEntries;
508 for (int ni = 0; ni < groups.length; ni++) {
509 if (ok < groups[ni]) {
510 int[] newGroups = new int[ni+1];
511 System.arraycopy(groups, 0, newGroups, 0, ni+1);
512 groups = newGroups;
513 groups[ni] = (int) ok;
514 ok = 0;
515 break;
516 }
517 ok -= groups[ni];
518 }
519 } else {
520 long excess = oldEntries - newEntries;
521 int[] newGroups = new int[groups.length+1];
522 System.arraycopy(groups, 0, newGroups, 0, groups.length);
523 newGroups[groups.length] = (int) excess;
524 groups = newGroups;
525 }
526 int[][] newMatrix = new int[groups.length][];
527
528 int i = 0;
529 int jMin = 1;
530 int jMax = matrix[i].length;
531 for (int ni = 0; ni < groups.length; ni++) {
532 int groupLength = groups[ni];
533 int[] group = new int[1+groupLength];
534 long groupWeight = 0;
535 newMatrix[ni] = group;
536 int njFill = 1;
537 while (njFill < group.length) {
538 int len = group.length - njFill;
539 while (jMin == jMax) {
540 jMin = 1;
541 jMax = matrix[++i].length;
542 }
543 if (len > jMax - jMin) len = jMax - jMin;
544 groupWeight += (long) matrix[i][0] * len;
545 System.arraycopy(matrix[i], jMax - len, group, njFill, len);
546 jMax -= len;
547 njFill += len;
548 }
549 Arrays.sort(group, 1, group.length);
550
551 group[0] = (int) ((groupWeight + groupLength/2) / groupLength);
552 }
553 assert(jMin == jMax);
554 assert(i == matrix.length-1);
555 return newMatrix;
556 }
557
558 public static
559 Histogram makeByteHistogram(InputStream bytes) throws IOException {
560 byte[] buf = new byte[1<<12];
561 int[] tally = new int[1<<8];
562 for (int nr; (nr = bytes.read(buf)) > 0; ) {
563 for (int i = 0; i < nr; i++) {
564 tally[buf[i] & 0xFF] += 1;
565 }
566 }
567
568 int[][] matrix = new int[1<<8][2];
569 for (int i = 0; i < tally.length; i++) {
570 matrix[i][0] = tally[i];
571 matrix[i][1] = i;
572 }
573 return new Histogram(matrix);
574 }
575
576
577 private static
578 int[] sortedSlice(int[] valueSequence, int start, int end) {
579 if (start == 0 && end == valueSequence.length &&
580 isSorted(valueSequence, 0, false)) {
581 return valueSequence;
582 } else {
583 int[] slice = new int[end-start];
584 System.arraycopy(valueSequence, start, slice, 0, slice.length);
585 Arrays.sort(slice);
586 return slice;
587 }
588 }
589
590
591 private static
592 boolean isSorted(int[] values, int from, boolean strict) {
593 for (int i = from+1; i < values.length; i++) {
594 if (strict ? !(values[i-1] < values[i])
595 : !(values[i-1] <= values[i])) {
596 return false;
597 }
598 }
599 return true;
600 }
601
602
603 private static
604 int[] maybeSort(int[] values) {
605 if (!isSorted(values, 0, false)) {
606 values = values.clone();
607 Arrays.sort(values);
608 }
609 return values;
610 }
611
612
613
614
615 private boolean assertWellFormed(int[] valueSequence) {
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660 return true;
661 }
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818 }